package org.hscoder.websearcher.dytt;

import com.fasterxml.jackson.core.type.TypeReference;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
import org.hscoder.websearcher.domain.FilmData;
import org.hscoder.websearcher.util.JsonUtil;
import org.hscoder.websearcher.util.ThreadUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.text.ParseException;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Fetches film detail pages and extracts a structured film entry from each of them.
 *
 * <p>The batch entry point reads a JSON links file (list page name mapped to detail URLs),
 * downloads every detail page, parses the labelled text lines into a {@link FilmData}
 * and writes one JSON file per list page into a {@code detail} directory next to the links file.
 */
public class DyttDetailsFetcher {

    private static final Logger logger = LoggerFactory.getLogger(DyttDetailsFetcher.class);

    /** Character that starts every labelled field line on a detail page. */
    private static final String FIELD_MARKER = "◎";

    /** Links file used by {@link #main(String[])} when no path is passed on the command line. */
    private static final String DEFAULT_LINKS_PATH = "D:\\temp\\dytt\\dytt.links";

    /**
     * Downloads the page at the given URL and extracts the film entry it describes.
     *
     * @param detailUrl URL of the film detail page
     * @return the parsed film, or {@code null} when the page has no content node
     *         or any error occurs while fetching or parsing (the error is logged)
     */
    private static FilmData fetchDetail(String detailUrl) {
        try {
            Document doc = Jsoup.parse(new URL(detailUrl), 15000);

            // Locate the content element.
            Element contentNode = doc.selectFirst("#Zoom > span");
            if (contentNode == null) {
                return null;
            }

            // The real content may be wrapped in a leading <p> or <div class="tpc_content">.
            // A span that only holds text has no child elements, so guard before descending.
            Elements children = contentNode.children();
            if (!children.isEmpty()) {
                Element firstChild = children.first();
                if (firstChild.is("p") || firstChild.is("div.tpc_content")) {
                    contentNode = firstChild;
                }
            }

            // Drop the download-link markup (strong and table elements).
            contentNode.select("strong").remove();
            contentNode.select("table").remove();

            // Create the film entry.
            FilmData film = new FilmData();
            film.setUrl(detailUrl);

            // The first image is the cover; it is removed so the remaining ones are the captures.
            Element coverImg = contentNode.selectFirst("img");
            if (coverImg != null) {
                film.setCoverImgUrl(coverImg.attr("src"));
                coverImg.remove();
            }
            for (Element img : contentNode.select("img")) {
                film.addCaptureImgUrl(img.attr("src"));
            }

            // Collect the direct text lines of the content node and parse them.
            List<String> lines = contentNode.textNodes().stream()
                    .map(t -> t.text().trim())
                    .collect(Collectors.toList());

            parseLines(lines, film);
            logger.info("parsed film {} from {}", film.getName(), detailUrl);

            return film;
        } catch (Exception e) {
            // Best effort: one broken page must not abort the whole batch.
            logger.error("fetch detail failed at {}", detailUrl, e);
            return null;
        }
    }

    /**
     * Parses the text lines of a detail page and fills the film entry with the recognised fields.
     *
     * <p>Every field line starts with a label such as {@code ◎译　　名}. The cast and the
     * introduction span several lines: all following lines up to the next label belong to them.
     * Lines that match no known label are ignored.
     *
     * @param lines text lines of the page, in document order
     * @param film  film entry to populate
     */
    private static void parseLines(List<String> lines, FilmData film) {

        int i = 0;
        while (i < lines.size()) {
            String line = lines.get(i).trim();

            // Film names
            if (line.startsWith("◎译　　名")) {
                film.setName(extractOneOf("◎译　　名", line, 0, "/"));
            } else if (line.startsWith("◎片　　名")) {
                film.setLocalName(extractOneOf("◎片　　名", line, 0, "/"));
            }
            // Period (year)
            else if (line.startsWith("◎年　　代")) {
                film.setPeriod(filterPeriod(extract("◎年　　代", line)));
            }
            // Regions (three alternative labels)
            else if (line.startsWith("◎产　　地")) {
                film.setRegions(filterRegions(extractMulti("◎产　　地", line, "/")));
            } else if (line.startsWith("◎国　　家")) {
                film.setRegions(filterRegions(extractMulti("◎国　　家", line, "/")));
            } else if (line.startsWith("◎地　　区")) {
                film.setRegions(filterRegions(extractMulti("◎地　　区", line, "/")));
            }
            // Categories
            else if (line.startsWith("◎类　　别")) {
                film.setSorts(filterSorts(extractMulti("◎类　　别", line, "/")));
            }
            // Language
            else if (line.startsWith("◎语　　言")) {
                film.setLanguage(filterLanguage(extractOneOf("◎语　　言", line, 0, "/")));
            }
            // Release date: only the part before the first "(" is parsed
            else if (line.startsWith("◎上映日期")) {
                String dateStr = extractOneOf("◎上映日期", line, 0, "\\(");
                try {
                    film.setReleaseDate(DateUtils.parseDate(dateStr, new String[]{"yyyy-MM-dd"}));
                } catch (ParseException e) {
                    logger.warn("parse release date '{}' failed ", line);
                    // NOTE(review): falls back to the current time, which is not a real release date.
                    film.setReleaseDate(new Date());
                }
            }
            // Scores. NOTE(review): both labels write the same field, so the later line wins.
            else if (line.startsWith("◎豆瓣评分")) {
                applyScore("◎豆瓣评分", line, film);
            } else if (line.toUpperCase(Locale.ROOT).startsWith("◎IMDB评分")) {
                // Locale.ROOT keeps the case-insensitive match independent of the default locale.
                applyScore("◎IMDB评分", line, film);
            } else if (line.startsWith("◎片　　长")) {
                film.setDuring(extract("◎片　　长", line));
            } else if (line.startsWith("◎导　　演")) {
                film.setDirector(extract("◎导　　演", line));
            } else if (line.startsWith("◎编　　剧")) {
                film.setScriptwriter(extract("◎编　　剧", line));
            } else if (line.startsWith("◎标　　签")) {
                film.setTags(extractMulti("◎标　　签", line, "\\|"));
            } else if (line.startsWith("◎主　　演")) {
                // First actor sits on the label line, the others on the following lines.
                List<String> actors = new ArrayList<>();
                String firstActor = extract("◎主　　演", line);
                if (!firstActor.isEmpty()) {
                    actors.add(firstActor);
                }
                int next = collectContinuation(lines, i + 1, actors);
                for (String actor : actors) {
                    film.addActor(actor);
                }
                // Step back by one because the loop increments i below.
                i = next - 1;
            } else if (line.startsWith("◎简　　介")) {
                List<String> introLines = new ArrayList<>();
                String firstIntro = extract("◎简　　介", line);
                if (!StringUtils.isEmpty(firstIntro)) {
                    introLines.add(firstIntro);
                }
                int next = collectContinuation(lines, i + 1, introLines);
                i = next - 1;

                film.setIntro(StringUtils.join(introLines, "\n"));
            }
            i++;
        }
    }

    /**
     * Collects the continuation lines of a multi-line field.
     *
     * <p>Starting at {@code from}, every line up to (not including) the next line that starts
     * with the field marker is blank-trimmed and appended to {@code sink}; lines that are empty
     * after trimming are skipped.
     *
     * @param lines all text lines of the page
     * @param from  index of the first candidate line
     * @param sink  list receiving the non-empty continuation lines
     * @return index of the first line that was not consumed ({@code lines.size()} at the end)
     */
    private static int collectContinuation(List<String> lines, int from, List<String> sink) {
        int j = from;
        while (j < lines.size()) {
            String text = trimBlank(lines.get(j));
            if (text.startsWith(FIELD_MARKER)) {
                break;
            }
            if (!text.isEmpty()) {
                sink.add(text);
            }
            j++;
        }
        return j;
    }

    /**
     * Parses the score from a score line and stores it in the film entry.
     *
     * <p>Only the part before the first "/" is parsed. When it is not a number a warning is
     * logged and the score is set to 5.0.
     *
     * @param label label the line starts with (only its length is used to cut the value)
     * @param line  the complete score line
     * @param film  film entry to update
     */
    private static void applyScore(String label, String line, FilmData film) {
        String scoreStr = extractOneOf(label, line, 0, "/");
        try {
            film.setScore(Double.parseDouble(scoreStr));
        } catch (NumberFormatException e) {
            logger.warn("parse score '{}' failed ", line);
            // NOTE(review): 5.0 is an arbitrary default and overwrites a score set earlier.
            film.setScore(5.0);
        }
    }

    /**
     * Extracts one value from a field that may hold several separated values.
     *
     * @param label label the line starts with
     * @param line  the complete field line
     * @param idx   index of the wanted value
     * @param sep   separator as a regular expression
     * @return the value at {@code idx}, or the whole field text when there are not enough values
     */
    private static String extractOneOf(String label, String line, int idx, String sep) {
        List<String> multi = extractMulti(label, line, sep);
        if (multi.size() > idx) {
            return multi.get(idx);
        }
        return extract(label, line);
    }

    /**
     * Extracts all values of a field, split by the given separator.
     *
     * @param label label the line starts with
     * @param line  the complete field line
     * @param sep   separator as a regular expression
     * @return the trimmed, non-empty values in their original order
     */
    private static List<String> extractMulti(String label, String line, String sep) {
        return Arrays.stream(extract(label, line).split(sep))
                .map(String::trim)
                // An empty field or doubled separators must not yield empty values.
                .filter(s -> !s.isEmpty())
                .collect(Collectors.toList());
    }

    /**
     * Extracts the text that follows the label on a field line.
     *
     * @param label label the line starts with; only its length is used
     * @param line  the complete field line
     * @return the blank-trimmed text after the label
     */
    private static String extract(String label, String line) {
        return trimBlank(line.substring(label.length()));
    }

    /**
     * Replaces every ideographic space (U+3000) with a regular space and trims the result.
     *
     * @param text text to clean
     * @return the cleaned text
     */
    private static String trimBlank(String text) {
        return text.replace('\u3000', ' ').trim();
    }

    /**
     * Normalises region names: any value containing "Poland" becomes its Chinese name.
     *
     * @param regions raw region values
     * @return a new list with the normalised values
     */
    private static List<String> filterRegions(List<String> regions) {
        return regions.stream()
                .map(region -> region.contains("Poland") ? "波兰" : region)
                .collect(Collectors.toList());
    }

    /**
     * Reduces the period field to its first four characters (the year).
     *
     * @param period raw period text
     * @return the first four characters, or the text unchanged when it is not longer than that
     */
    private static String filterPeriod(String period) {
        return period.length() > 4 ? period.substring(0, 4) : period;
    }

    /**
     * Maps variants of a language name to one canonical name.
     *
     * @param language raw language text
     * @return the canonical name, or the text unchanged when no rule matches
     */
    private static String filterLanguage(String language) {

        if (language.contains("Polish") || language.contains("Poland")) {
            return "波兰语";
        }
        if (language.contains("北印度语") || language.contains("印地语") || language.contains("泰米尔语") || language.contains("泰卢固语")) {
            return "印度语";
        }
        if (language.contains("印度尼西亚语")) {
            return "印尼语";
        }
        if (language.contains("英文")) {
            return "英语";
        }
        if (language.contains("普通")) {
            return "国语";
        }
        return language;
    }

    /**
     * Normalises category names.
     *
     * @param sorts raw category values
     * @return a new list with the normalised values
     */
    private static List<String> filterSorts(List<String> sorts) {
        return sorts.stream().map(sort -> {
            String normalised = sort;
            // Both spellings of "documentary" collapse into one category.
            if (normalised.contains("纪录片") || normalised.contains("记录")) {
                normalised = "纪录";
            }
            if (normalised.contains("Poland")) {
                normalised = "波兰";
            }
            return normalised;
        }).collect(Collectors.toList());
    }

    /**
     * Fetches the details of every film listed in the links file.
     *
     * <p>The links file is a JSON object mapping a list page name to the detail URLs found on
     * that page. For each page the parsed films are written as JSON to a file named after the
     * page inside a {@code detail} directory next to the links file. A failure to write one
     * page is logged and does not stop the remaining pages.
     *
     * @param linksPath path of the JSON links file
     * @throws IOException when the links file cannot be read
     */
    private static void batchFetchDetails(String linksPath) throws IOException {
        // Parse the links file.
        File linksFile = new File(linksPath);
        String linksContent = FileUtils.readFileToString(linksFile, "UTF-8");
        Map<String, List<String>> results = JsonUtil.fromJson(linksContent, new TypeReference<Map<String, List<String>>>() { });
        if (results == null) {
            logger.warn("no links found in {}", linksPath);
            return;
        }

        // Details go into the "detail" directory next to the links file.
        File dir = new File(linksFile.getParent(), "detail");
        dir.mkdirs();

        // Visit every page and every link on it.
        for (Map.Entry<String, List<String>> entry : results.entrySet()) {
            String page = entry.getKey();
            List<String> filmLinks = entry.getValue() != null ? entry.getValue() : Collections.<String>emptyList();

            List<FilmData> films = new ArrayList<>();
            for (String url : filmLinks) {
                FilmData film = fetchDetail(url);
                if (film != null) {
                    films.add(film);
                }

                // Pause between requests.
                ThreadUtil.waitSome(200);
            }

            // Write the films of this list page to its own file.
            String pageFilmContent = JsonUtil.toPrettyJson(films);
            try {
                FileUtils.write(new File(dir, page), pageFilmContent, "UTF-8");

                logger.info("save page {}.", page);

            } catch (IOException e) {
                logger.error("write film data failed for page {}", page, e);
            }

            ThreadUtil.waitSome(200);
        }
    }

    /**
     * Runs the batch fetch.
     *
     * @param args optional: the path of the links file as first argument; a built-in default
     *             path is used when it is missing
     * @throws IOException when the links file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String linksPath = (args != null && args.length > 0) ? args[0] : DEFAULT_LINKS_PATH;
        batchFetchDetails(linksPath);
    }

}
